Effect of cohesin looping on genome architecture, from the perspective of the nuclear lamina.
We have measured NL interactions in mESC during CTCF depletion. This assumes that CTCF peaks are enriched at LAD borders and might be involved in the formation of LADs. However, can I confirm that CTCF sites are equally enriched in mESC cells compared to other cell lines?
Calculate density of CTCF sites from LAD borders.
Load the libraries and set the parameters.
# Load dependencies
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggbeeswarm))
# # Prepare output
output_dir <- "ts220113_CTCF_enrichment_at_LAD_borders"
dir.create(output_dir, showWarnings = FALSE)
# Prepare bin size and scaling
bin.size <- bin.size.txt <- "10kb"
bin.size <- as.numeric(gsub("kb", "", bin.size)) * 1e3
scaling = 1e6 / bin.sizelibrary(knitr)
opts_chunk$set(fig.width = 10, fig.height = 4,
message = F, warning = F,
dev=c('png', 'pdf'), fig.path = file.path(output_dir, "figures/"))
pdf.options(useDingbats = FALSE)List functions.
LoadBed <- function(metadata, column, reduce = F, size_filter = F) {
# Load LADs as GRangesList from metadata object
bed <- GRangesList()
for (i in 1:nrow(metadata)) {
f.name <- (metadata %>% pull(column))[i]
f.import <- import(f.name)
if (reduce) {
f.import <- GenomicRanges::reduce(f.import, min.gapwidth = 50e3)
}
if (size_filter) {
f.import <- f.import[width(f.import) > 50e3]
}
f.import$cell <- metadata$cell[i]
f.import <- f.import[seqnames(f.import) %in% c(paste0("chr", 1:22), "chrX")]
bed <- c(bed, GRangesList(f.import))
}
names(bed) <- metadata$cell
bed
}
LADBorders <- function(LADs, bins, min.distance = 0) {
# Given a (named) GRangesList of LADs, return a GRangesList with borders
# Strand information defines left (+) or right (-) border
# Also, require complete bin object to determine chromosome start and end
cells <- names(LADs)
LAD.borders <- GRangesList()
for (cell in cells ) {
# Get LADs and bins for cell
cell.LADs <- LADs[[cell]]
cell.bins <- bins[[cell]]
# Remove small iLADs and remove small LADs
cell.LADs <- GenomicRanges::reduce(cell.LADs, min.gapwidth = min.distance)
cell.LADs <- cell.LADs[width(cell.LADs) > min.distance]
# Get borders
cell.borders <- c(GRanges(seqnames = seqnames(cell.LADs),
ranges = IRanges(start = start(cell.LADs),
end = start(cell.LADs)),
strand = "+"),
GRanges(seqnames = seqnames(cell.LADs),
ranges = IRanges(start = end(cell.LADs),
end = end(cell.LADs)),
strand = "-"))
cell.borders <- sort(cell.borders, ignore.strand = T)
# Get start and end of the chromosome and filter overlapping borders
chromosome.ends <- c(cell.bins[! duplicated(as.character(seqnames(cell.bins)))],
rev(cell.bins)[! duplicated(as.character(seqnames(rev(cell.bins))))])
chromosome.ends <- flank(chromosome.ends, 5000, both = T)
cell.borders <- cell.borders[! overlapsAny(cell.borders,
chromosome.ends,
ignore.strand = T)]
cell.borders$cell <- cell
LAD.borders <- c(LAD.borders, GRangesList(cell.borders))
}
names(LAD.borders) <- cells
LAD.borders
}
grMid = function(gr) {
start(gr) <- end(gr) <- rowMeans(cbind(start(gr), end(gr)))
gr
}
CTCFDistance <- function(sites, LAD.borders, nearest = T, border.class = F) {
# Given (named) GRangesLists of CTCF sites and LAD borders, calculate the
# distance to the preceding and following LAD. Return a new GRangesList
cells <- names(sites)
# Prepare holders
sites.new <- sites
# Loop over cells
for (cell in cells) {
# Get cell objects
cell.CTCF <- grMid(sites[[cell]])
cell.borders <- LAD.borders[[cell]]
# Make sure the chromosomes are as they should be - especially for the bins
cell.CTCF <- cell.CTCF[seqnames(cell.CTCF) %in% c(paste0("chr", 1:22),
"chrX")]
cell.borders <- cell.borders[seqnames(cell.borders) %in% c(paste0("chr", 1:22),
"chrX")]
# Make sure that a cell column is present - especially for the bins
cell.CTCF$cell <- cell
# Preceding distance
cell.CTCF$dis.precede <- cell.CTCF$strand.precede <- cell.CTCF$border.class.precede <- NA
idx.precede <- precede(cell.CTCF, cell.borders, ignore.strand = T, select = "all")
cell.CTCF$dis.precede[queryHits(idx.precede)] <- distance(cell.CTCF[queryHits(idx.precede)],
cell.borders[subjectHits(idx.precede)],
ignore.strand = T)
cell.CTCF$strand.precede[queryHits(idx.precede)] <- strand(cell.borders[subjectHits(idx.precede)])
if (border.class) {
cell.CTCF$border.class.precede[queryHits(idx.precede)] <- cell.borders$class[subjectHits(idx.precede)]
}
# Following distance
cell.CTCF$dis.follow <- cell.CTCF$strand.follow <- cell.CTCF$border.class.follow <- NA
idx.follow <- follow(cell.CTCF, cell.borders, ignore.strand = T, select = "all")
cell.CTCF$dis.follow[queryHits(idx.follow)] <- distance(cell.CTCF[queryHits(idx.follow)],
cell.borders[subjectHits(idx.follow)],
ignore.strand = T)
cell.CTCF$strand.follow[queryHits(idx.follow)] <- strand(cell.borders[subjectHits(idx.follow)])
if (border.class) {
cell.CTCF$border.class.follow[queryHits(idx.follow)] <- cell.borders$class[subjectHits(idx.follow)]
}
# Exception: overlapping (follow = distance (0), precede = NA)
idx.overlap <- findOverlaps(cell.CTCF, cell.borders, ignore.strand = T)
cell.CTCF$dis.follow[queryHits(idx.overlap)] <- distance(cell.CTCF[queryHits(idx.overlap)],
cell.borders[subjectHits(idx.overlap)],
ignore.strand = T)
cell.CTCF$dis.precede[queryHits(idx.overlap)] <- NA
cell.CTCF$strand.follow[queryHits(idx.overlap)] <- cell.CTCF$strand.precede[queryHits(idx.overlap)] <-
strand(cell.borders[subjectHits(idx.overlap)])
if (border.class) {
cell.CTCF$border.class.precede[queryHits(idx.overlap)] <-
cell.CTCF$border.class.follow[queryHits(idx.overlap)] <-
cell.borders$class[subjectHits(idx.overlap)]
}
# Alternative: only use information from the nearest hit
if (nearest) {
# Remove precede information if follow is smaller
idx.remove.precede <- which(cell.CTCF$dis.follow < cell.CTCF$dis.precede)
cell.CTCF$dis.precede[idx.remove.precede] <- NA
# Remove follow information if precede is smaller
idx.remove.follow <- which(cell.CTCF$dis.follow > cell.CTCF$dis.precede)
cell.CTCF$dis.follow[idx.remove.follow] <- NA
}
# Update object
sites.new[[cell]] <- cell.CTCF
}
sites.new
}
CountPerBins <- function(sites, sample_name, bin.size = 5000, border.class = F) {
# Determine count of CTCF sites per genomic bins
if (border.class == F) {
tib <- as_tibble(unlist(sites)) %>%
mutate(cell = factor(cell, levels = unique(cell))) %>%
add_column(number = 1:nrow(.)) %>%
dplyr::select(number, cell, dis.precede, dis.follow, strand.precede, strand.follow) %>%
mutate(dis.precede.group = as.numeric(cut(dis.precede,
breaks = seq(0, max(dis.precede, na.rm = T) + 1,
by = bin.size))) - 1,
dis.follow.group = as.numeric(cut(dis.follow,
breaks = seq(0, max(dis.follow, na.rm = T) + 1,
by = bin.size))) - 1) %>%
dplyr::select(-dis.precede, -dis.follow) %>%
mutate(dis.precede.group = ifelse(strand.precede == "+",
-dis.precede.group, dis.precede.group),
dis.follow.group = ifelse(strand.follow == "+",
dis.follow.group, -dis.follow.group)) %>%
dplyr::select(-strand.precede, -strand.follow) %>%
gather(key, value, -number, -cell) %>%
group_by(cell, value) %>%
dplyr::summarise(count = n()) %>%
ungroup() %>%
rename(count = sample_name)
} else {
tib <- as_tibble(unlist(sites)) %>%
mutate(cell = factor(cell, levels = unique(cell))) %>%
add_column(number = 1:nrow(.)) %>%
dplyr::select(number, cell, dis.precede, dis.follow, strand.precede, strand.follow,
border.class.precede, border.class.follow) %>%
mutate(dis.precede.group = as.numeric(cut(dis.precede,
breaks = seq(0, max(dis.precede, na.rm = T) + 1,
by = bin.size))) - 1,
dis.follow.group = as.numeric(cut(dis.follow,
breaks = seq(0, max(dis.follow, na.rm = T) + 1,
by = bin.size))) - 1) %>%
dplyr::select(-dis.precede, -dis.follow) %>%
mutate(dis.precede.group = ifelse(strand.precede == "+",
-dis.precede.group, dis.precede.group),
dis.follow.group = ifelse(strand.follow == "+",
dis.follow.group, -dis.follow.group)) %>%
dplyr::select(-strand.precede, -strand.follow) %>%
gather(key, value, -number, -cell, -border.class.precede, -border.class.follow) %>%
drop_na() %>%
rowwise() %>%
mutate(border.class = ifelse(key == "dis.precede.group",
border.class.precede, border.class.follow)) %>%
ungroup() %>%
dplyr::select(-border.class.precede, -border.class.follow) %>%
group_by(cell, value, border.class) %>%
dplyr::summarise(count = n()) %>%
ungroup() %>%
rename(count = sample_name)
}
tib
}
ClassifyLADBorders <- function(borders, sites, LADs, overlapping = F,
cells = levels(metadata$cell),
ranges = c(0, 20000), col_name = "CTCF") {
# Classify LAD borders into with and without CTCF
for (cell in cells) {
# Get cell objects
cell.borders <- borders[[cell]]
cell.sites <- sites[[cell]]
cell.LADs <- LADs[[cell]]
# Remove overlapping sites
if (! overlapping) cell.sites <- cell.sites[! overlapsAny(cell.sites, cell.LADs)]
# Get LAD borders with CTCF peaks within the given range
ovl <- distanceToNearest(cell.borders, cell.sites)
# Classify borders
mcols(cell.borders)[, paste0(col_name, ".distance")] <- mcols(ovl)$distance
mcols(cell.borders)[, col_name] <- ifelse(mcols(ovl)$distance >= ranges[1] &
mcols(ovl)$distance <= ranges[2],
"CTCF", "nonCTCF")
# Also, add the number of CTCF sites within the range
# For this, determine the distance from CTCF site to LAD border instead
ovl <- as_tibble(distanceToNearest(cell.sites, cell.borders)) %>%
filter(distance >= ranges[1] & distance <= ranges[2]) %>%
group_by(subjectHits) %>%
dplyr::summarise(n = n())
mcols(cell.borders)[, paste0(col_name, ".count")] <- 0
mcols(cell.borders)[ovl$subjectHits, paste0(col_name, ".count")] <- ovl$n
# Update
borders[[cell]] <- cell.borders
}
borders
}
GetBorderDifferences <- function(borders, min.distance = 1e5) {
# Differentiate between "shared" and "unique" borders, reflecting fLADs and cLADs
# Get all borders
borders.all <- unlist(borders)
# For every cell, find "unique" borders
cells <- names(borders)
for (cell in cells) {
# Get cell objects
borders.cell <- borders[[cell]]
borders.others <- borders.all[borders.all$cell != cell]
# Get the distance to nearest
dis <- distanceToNearest(borders.cell, borders.others)
# Classify borders
borders.cell$class <- ifelse(mcols(dis)$distance >= min.distance,
"unique", "shared")
# Update object
borders[[cell]] <- borders.cell
}
borders
}I will perform this analysis in multiple cell types in which the data (DamID + CTCF ChIP) is available. These are:
Note that some analyses include serum mESC cells with old DamID data. This is not included in the corresponding story / manuscript. The results in these cells are not different from the other cell type. We reasoned that addition of these cells only complicate the story and do not add anything.
For this analysis, I will use 10kb LAD calls from DamID data. I will also do this for the pA-DamID data, to validate that the different data shows the same trends.
#######################################
## Prepare metadata
metadata <- as_tibble(t(data.frame(
# mESC
c("mESC",
"/DATA/scratch/usr/t.v.schaik/proj/3D_nucleus/results/ts180110_4DN_DataProcessing/results_mouse/HMM/bin-10kb/mESC_LMNB1-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts181120_pADamID_mouse/analysis_CTCF_AID/Data_NQ/ChIP_NQ/CohesinFactors/2_Wapl-0D-antiCtcf_sampleOnly_peaks.narrowPeak"),
# "ts200605_CTCF_enrichment_at_LAD_borders/mESC_CTCF_peaks_200.bed"),
# mESC - serum
c("mESC_serum",
"ts201109_CTCF_borders_versus_histone_modifications_serum/HMM/mESC_norm_AD.bed.gz",
"Data_NQ/ChIP_NQ/HistoneModifications/Public_serum_ChIP/mESC_serum_CTCFPeaksOnly_ENCFF347BWU.bed"),
# mNPC
c("mNPC",
"/DATA/scratch/usr/t.v.schaik/proj/3D_nucleus/results/ts180110_4DN_DataProcessing/results_mouse/HMM/bin-10kb/NPC_LMNB1-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts181120_pADamID_mouse/analysis_CTCF_AID/Data_NQ/ChIP_NQ/CohesinFactors/5_WaplNPC-0h_CtcfChIP_sampleOnly_peaks.narrowPeak"),
# H1
c("H1",
"/DATA/scratch/usr/t.v.schaik/proj/3D_nucleus/results/ts180110_4DN_DataProcessing/results/HMM/bin-10kb/H1_LMNB1-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts181120_pADamID_mouse/analysis_CTCF_AID/ts200207_CTCF_peaks/ENCODE/ENCFF821AQO_H1.bed"),
# Hap1
c("Hap1",
"/DATA/scratch/usr/t.v.schaik/proj/3D_nucleus/results/ts180110_4DN_DataProcessing/results/HMM/bin-10kb/Hap1_LMNB1-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/c.leemans/projects/chip_snake/hg38/peaks/Hap1/Haarhuis2017/CTCF_GSM2493878_r1_SRR5266526_peaks.narrowPeak"),
# K562
c("K562",
"/DATA/scratch/usr/t.v.schaik/proj/3D_nucleus/results/ts180110_4DN_DataProcessing/results/HMM/bin-10kb/K562_LMNB1-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/c.leemans/projects/chip_snake/hg38/peaks/K562/Schmidl2015/CTCF_chipmentation_r2_SRR2085872_peaks.narrowPeak"),
# HCT116
c("HCT116",
"/DATA/scratch/usr/t.v.schaik/proj/3D_nucleus/results/ts180110_4DN_DataProcessing/results/HMM/bin-10kb/HCT116_LMNB1-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts181120_pADamID_mouse/analysis_CTCF_AID/ts200207_CTCF_peaks/ENCODE/ENCFF518MQA_HCT116.bed"))),
.name_repair = ~ c("cell", "LAD.file", "CTCF.file"))
metadata <- metadata %>%
mutate(cell = factor(cell, levels = unique(cell)))
# Also create one object for pA-DamID data
metadata.pa <- as_tibble(t(data.frame(
# mESC-pA
c("mESC_pA_PT",
"../results_NQ/HMM/bin-10kb/pADamID_PT-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts181120_pADamID_mouse/analysis_CTCF_AID/Data_NQ/ChIP_NQ/CohesinFactors/2_Wapl-0D-antiCtcf_sampleOnly_peaks.narrowPeak"),
# Hap1
c("Hap1_pA",
"/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts180813_GCF5083_pADamIDtests/results/HMM/bin-10kb/Hap1_LMNB1-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/c.leemans/projects/chip_snake/hg38/peaks/Hap1/Haarhuis2017/CTCF_GSM2493878_r1_SRR5266526_peaks.narrowPeak"),
# K562
c("K562_pA",
"/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts180813_GCF5083_pADamIDtests/results/HMM/bin-10kb/K562_LMNB1-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/c.leemans/projects/chip_snake/hg38/peaks/K562/Schmidl2015/CTCF_chipmentation_r2_SRR2085872_peaks.narrowPeak"),
# HCT116
c("HCT116_pA",
"/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts180813_GCF5083_pADamIDtests/results/HMM/bin-10kb/HCT116_LMNB1-10kb-combined_AD.bed.gz",
"/DATA/scratch/usr/t.v.schaik/proj/tests/results/ts181120_pADamID_mouse/analysis_CTCF_AID/ts200207_CTCF_peaks/ENCODE/ENCFF518MQA_HCT116.bed"))),
.name_repair = ~ c("cell", "LAD.file", "CTCF.file"))
metadata.pa <- metadata.pa %>%
mutate(cell = factor(cell, levels = unique(cell))) Load LADs and prepare basic figures on LAD characteristics (number and size).
#######################################
## Load LADs
# Load LADs as list of GRanges
LADs <- LoadBed(metadata, "LAD.file", reduce = T, size_filter = T)
LADs.pa <- LoadBed(metadata.pa, "LAD.file", reduce = T, size_filter = T)
# Combine LADs into one GRanges with cells as argument
LADs.unlist <- unlist(LADs)
#######################################
## Basic figures
# Plot number of LADs
as_tibble(LADs) %>%
ggplot(aes(x = cell, fill = cell)) +
geom_bar() +
ggtitle("LAD count") +
xlab("Cell type") +
ylab("Count") +
scale_fill_brewer(palette = "Set1", name = "Cell type", guide = F) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))# Plot size distribution of LADs
as_tibble(LADs) %>%
ggplot(aes(x = width / 1e6, col = cell)) +
geom_density() +
ggtitle("LAD size") +
xlab("LAD size (Mb)") +
ylab("Density") +
scale_color_brewer(palette = "Set1", name = "Cell type") +
scale_x_log10() +
theme_bw() +
theme(aspect.ratio = 1)Load CTCF peaks and prepare basic figures on characteristics (number).
I want to add motif orientation to all comparisons, I will use motif orientation from FIMO (as Robin did).
#######################################
## Load CTCF sites
# Load LADs as list of GRanges
CTCF.sites <- LoadBed(metadata, "CTCF.file")
CTCF.sites.pa <- LoadBed(metadata.pa, "CTCF.file")
# Combine LADs into one GRanges with cells as argument
CTCF.sites.unlist <- unlist(CTCF.sites)
#######################################
## Basic figures
# Plot number of CTCF peaks
as_tibble(CTCF.sites) %>%
ggplot(aes(x = cell, fill = cell)) +
geom_bar() +
ggtitle("CTCF site count") +
xlab("Cell type") +
ylab("Count") +
scale_fill_brewer(palette = "Set1", name = "Cell type", guide = F) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))# #######################################
# ## Add orientation to mESC CTCF peaks
#
# Prepare motif orientation from FIMO output (MEME suite)
fimo_mm10 <- read_tsv("ts210201_CTCF_enrichment_at_LAD_borders/ctcf_peaks/fimo_mm10.tsv") %>%
dplyr::rename(seqnames = "sequence_name",
sequence = "matched_sequence") %>%
drop_na() %>%
as(., "GRanges") %>%
sort(.)
export.bed(fimo_mm10, "ts210201_CTCF_enrichment_at_LAD_borders/ctcf_peaks/fimo_mm10.bed")
fimo_hg38 <- read_tsv("ts210201_CTCF_enrichment_at_LAD_borders/ctcf_peaks/fimo_hg38.tsv") %>%
dplyr::rename(seqnames = "sequence_name",
sequence = "matched_sequence") %>%
drop_na() %>%
as(., "GRanges") %>%
sort(.)
export.bed(fimo_hg38, "ts210201_CTCF_enrichment_at_LAD_borders/ctcf_peaks/fimo_hg38.bed")
MotifOrientation <- function(f_orientation, gr_sites, dis_cutoff = 50) {
# Update GRanges (gr_sites) with orientation (from f_orien...)
# With a max distance of (dis_cutoff)
# Load orientation file
ctcf.orientation <- as_tibble(f_orientation)
ctcf.orientation$motif <- 1:nrow(ctcf.orientation)
# Save as bed files
export.bed(as_tibble(ctcf.orientation) %>% filter(strand == "+"),
file.path(output_dir, "ctcf_motif_plus.bed"))
export.bed(as_tibble(ctcf.orientation) %>% filter(strand == "-"),
file.path(output_dir, "ctcf_motif_minus.bed"))
# Add this to the CTCF peaks where possible
dis <- as_tibble(distanceToNearest(as(ctcf.orientation, "GRanges"),
gr_sites))
# Filter for reasonable distances
dis <- dis %>%
filter(distance < 50) %>%
arrange(distance) %>%
rename_at(1:2, ~ c("motif", "peak"))
# Add motif numbers
dis_bypeak <- dis %>%
left_join(ctcf.orientation %>%
dplyr::select(motif, strand, score)) %>%
group_by(peak) %>%
summarise(n = n(),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "both"))
# Add the CTCF sites
gr_sites$motif_count <- 0
gr_sites$motif_strand <- NA
gr_sites$motif_count[dis_bypeak$peak] <- dis_bypeak$n
gr_sites$motif_strand[dis_bypeak$peak] <- dis_bypeak$strand
gr_sites
}
# Add orientation based on orientation file
for (cell in metadata$cell) {
if (cell %in% c("mESC", "mESC_serum", "mNPC")) {
CTCF.sites[[cell]] <- MotifOrientation(fimo_mm10,
CTCF.sites[[cell]])
} else {
CTCF.sites[[cell]] <- MotifOrientation(fimo_hg38,
CTCF.sites[[cell]])
}
}
for (cell in metadata.pa$cell) {
if (cell %in% c("mESC_pA_PT")) {
CTCF.sites.pa[[cell]] <- MotifOrientation(fimo_mm10,
CTCF.sites.pa[[cell]])
} else {
CTCF.sites.pa[[cell]] <- MotifOrientation(fimo_hg38,
CTCF.sites.pa[[cell]])
}
}Finally, I will prepare lists with all the genomic bins to use as a background distribution.
#######################################
## As control, also do this for all 10kb bins
# Read 10kb bins (for human and mouse)
bins.10kb.human <- read_tsv("/DATA/scratch/usr/t.v.schaik/proj/3D_nucleus/results/ts180110_4DN_DataProcessing/results/counts/bin-10kb/K562_r1_LMNB1-10kb.counts.txt.gz", col_names = c("seqnames", "start", "end", "count"))
bins.10kb.human <- as(bins.10kb.human, "GRanges")
bins.10kb.human$species <- "human"
start(bins.10kb.human) <- start(bins.10kb.human) + 1
bins.10kb.mouse <- read_tsv("../results_NQ/counts/bin-10kb/pADamID_CTCF-EL_0h_Dam_r1-10kb.counts.txt.gz",
col_names = c("seqnames", "start", "end", "count"))
bins.10kb.mouse <- as(bins.10kb.mouse, "GRanges")
bins.10kb.mouse$species <- "mouse"
start(bins.10kb.mouse) <- start(bins.10kb.mouse) + 1
# Prepare similar GRangesList as the CTCF sites
bins.10kb <- GRangesList(mESC = bins.10kb.mouse,
mESC_serum = bins.10kb.mouse,
mNPC = bins.10kb.mouse,
H1 = bins.10kb.human,
Hap1 = bins.10kb.human,
K562 = bins.10kb.human,
HCT116 = bins.10kb.human)
# HFF = bins.10kb.human,
# HEPG2 = bins.10kb.human)
# RPE = bins.10kb.human)
bins.10kb.pa <- GRangesList(mESC_pA_PT = bins.10kb.mouse,
Hap1_pA = bins.10kb.human,
K562_pA = bins.10kb.human,
HCT116_pA = bins.10kb.human)LADs are usually depleted in CTCF sites. Can I confirm this in the data here?
#######################################
## Add LAD overlap to CTCF sites
# Loop over the cells and add LAD overlap
for (cell in metadata$cell) {
# Get the cell-specific data
cell.CTCF <- CTCF.sites[[cell]]
cell.LADs <- LADs[[cell]]
# Add LAD overlap
cell.CTCF$which.LAD <- cell.CTCF$overlap.LAD <- NA
dis <- as_tibble(distanceToNearest(cell.CTCF, cell.LADs))
cell.CTCF$which.LAD[dis$queryHits] <- dis$subjectHits
cell.CTCF$overlap.LAD[dis$queryHits] <- dis$distance == 0
# Save data
CTCF.sites[[cell]] <- cell.CTCF
}
for (cell in metadata.pa$cell) {
# Get the cell-specific data
cell.CTCF <- CTCF.sites.pa[[cell]]
cell.LADs <- LADs.pa[[cell]]
# Add LAD overlap
cell.CTCF$which.LAD <- cell.CTCF$overlap.LAD <- NA
dis <- as_tibble(distanceToNearest(cell.CTCF, cell.LADs))
cell.CTCF$which.LAD[dis$queryHits] <- dis$subjectHits
cell.CTCF$overlap.LAD[dis$queryHits] <- dis$distance == 0
# Save data
CTCF.sites.pa[[cell]] <- cell.CTCF
}
#######################################
## Plot fraction in LAD
# Generate summary
tib <- as_tibble(unlist(CTCF.sites)) %>%
group_by(cell) %>%
summarise(count = n(),
fraction = mean(overlap.LAD, na.rm = T))
# Random distribution of CTCF sites
tib.random <- as_tibble(do.call(rbind,
lapply(metadata$cell,
function(cell) c(cell,
sum(bins.10kb[[cell]] %over% LADs[[cell]]) /
length(bins.10kb[[cell]])))),
.name_repair = ~ c("cell", "expected"))
# Plot fraction in LADs and expectation
tib %>%
ggplot(aes(x = cell, y = fraction)) +
geom_bar(stat = "identity") +
geom_point(data = tib.random, aes(x = cell, y = expected),
col = "red", size = 3) +
ggtitle("CTCF enrichment in LADs") +
ylim(0, 1) +
xlab("Cell type") +
ylab("Fraction peaks in LADs") +
scale_fill_brewer(palette = "Set1", name = "Cell type", guide = F) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))Note that the red dot shows the expected count, based on the total size of all LADs combined. In all cell lines, CTCF peaks are depleted in LADs.
Next, get the LAD borders from the LAD domains. Extract LAD borders with orientation, where “+” is left border and “-” is the left border.
#######################################
## Get LAD borders
# Minimum distance that LADs should be apart
plotting.window <- 0
# Get the borders
LAD.borders <- LADBorders(LADs, bins.10kb, min.distance = plotting.window)
LAD.borders.pa <- LADBorders(LADs.pa, bins.10kb.pa, min.distance = plotting.window)Update 20-07-22: it might be good to remove LAD borders with active genes, which can further impact results. Let’s mark those LAD borders. (Without removing them)
# Load expression
genes_mouse <- readRDS("ts200604_GeneExpression/genes.rds")
fpkm_mesc <- readRDS("ts200604_GeneExpression/genes_fpkm_mean.rds")
# Expand genes - genes
genes_mouse_expand <- genes_mouse
start(genes_mouse_expand) <- start(genes_mouse) - (bin.size+1)
end(genes_mouse_expand) <- end(genes_mouse) + (bin.size+1)
# Mark borders close to genes
ovl <- as_tibble(findOverlaps(LAD.borders[["mESC"]],
genes_mouse_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_mouse)[subjectHits]),
expression = fpkm_mesc$WT_0h[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders[["mESC"]]$ovl_gene <- (1:length(LAD.borders[["mESC"]])) %in% ovl$queryHits
LAD.borders[["mESC"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders[["mESC"]]$max_gene_expression <- 0
LAD.borders[["mESC"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders[["mESC"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expression
# Mark borders close to genes - pA-DamID data
ovl <- as_tibble(findOverlaps(LAD.borders.pa[["mESC_pA_PT"]],
genes_mouse_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_mouse)[subjectHits]),
expression = fpkm_mesc$WT_0h[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders.pa[["mESC_pA_PT"]]$ovl_gene <- (1:length(LAD.borders.pa[["mESC_pA_PT"]])) %in% ovl$queryHits
LAD.borders.pa[["mESC_pA_PT"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders.pa[["mESC_pA_PT"]]$max_gene_expression <- 0
LAD.borders.pa[["mESC_pA_PT"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders.pa[["mESC_pA_PT"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expression
# Also get TSS and TES
tss_mouse <- tes_mouse <- genes_mouse
mcols(tss_mouse) <- mcols(tes_mouse) <- data.frame(motif_count = 1,
motif_strand = strand(genes_mouse))
start(tss_mouse) <- end(tss_mouse) <- ifelse(strand(genes_mouse) == "+",
start(genes_mouse),
end(genes_mouse))
start(tes_mouse) <- end(tes_mouse) <- ifelse(strand(genes_mouse) == "-",
start(genes_mouse),
end(genes_mouse))# Load expression
fpkm_mNPC <- full_join(read_tsv(file.path(output_dir, "gene_expression", "GSM2533845_NPC_rep1.txt.gz")) %>%
dplyr::rename(npc_r1 = "fpkm"),
read_tsv(file.path(output_dir, "gene_expression", "GSM2533846_NPC_rep2.txt.gz")) %>%
dplyr::rename(npc_r2 = "fpkm")) %>%
mutate(mNPC = (npc_r1 + npc_r2) / 2) %>%
dplyr::rename(ensembl_id = "ENSEMBL_ID")
fpkm_mNPC <- left_join(fpkm_mesc %>% dplyr::select(ensembl_id),
fpkm_mNPC)
# Mark borders close to genes
ovl <- as_tibble(findOverlaps(LAD.borders[["mNPC"]],
genes_mouse_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_mouse)[subjectHits]),
expression = fpkm_mNPC$mNPC[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders[["mNPC"]]$ovl_gene <- (1:length(LAD.borders[["mNPC"]])) %in% ovl$queryHits
LAD.borders[["mNPC"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders[["mNPC"]]$max_gene_expression <- 0
LAD.borders[["mNPC"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders[["mNPC"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expressionI will also do this for the human data. Simple solution: repeat code several times.
# Load expression
genes_human <- readRDS("~/mydata/proj/3D_nucleus/results/ts191220_laminaVsNucleolus_NewAnalyses/ts200113_GeneExpression/genes.rds")
fpkm_human <- readRDS("~/mydata/proj/3D_nucleus/results/ts191220_laminaVsNucleolus_NewAnalyses/ts200113_GeneExpression/tib_fpkm.rds")
# Expand genes
genes_human_expand <- genes_human
start(genes_human_expand) <- start(genes_human) - (bin.size+1)
end(genes_human_expand) <- end(genes_human) + (bin.size+1)
# Mark borders close to genes - H1 cells
ovl <- as_tibble(findOverlaps(LAD.borders[["H1"]],
genes_human_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_human)[subjectHits]),
expression = fpkm_human$H1_expr[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders[["H1"]]$ovl_gene <- (1:length(LAD.borders[["H1"]])) %in% ovl$queryHits
LAD.borders[["H1"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders[["H1"]]$max_gene_expression <- 0
LAD.borders[["H1"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders[["H1"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expression
# Mark borders close to genes - Hap1 cells
ovl <- as_tibble(findOverlaps(LAD.borders[["Hap1"]],
genes_human_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_human)[subjectHits]),
expression = fpkm_human$Hap1_expr[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders[["Hap1"]]$ovl_gene <- (1:length(LAD.borders[["Hap1"]])) %in% ovl$queryHits
LAD.borders[["Hap1"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders[["Hap1"]]$max_gene_expression <- 0
LAD.borders[["Hap1"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders[["Hap1"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expression
ovl <- as_tibble(findOverlaps(LAD.borders.pa[["Hap1_pA"]],
genes_human_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_human)[subjectHits]),
expression = fpkm_human$Hap1_expr[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders.pa[["Hap1_pA"]]$ovl_gene <- (1:length(LAD.borders.pa[["Hap1_pA"]])) %in% ovl$queryHits
LAD.borders.pa[["Hap1_pA"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders.pa[["Hap1_pA"]]$max_gene_expression <- 0
LAD.borders.pa[["Hap1_pA"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders.pa[["Hap1_pA"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expression
# Mark borders close to genes - HCT116 cells
ovl <- as_tibble(findOverlaps(LAD.borders[["HCT116"]],
genes_human_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_human)[subjectHits]),
expression = fpkm_human$HCT116_expr[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders[["HCT116"]]$ovl_gene <- (1:length(LAD.borders[["HCT116"]])) %in% ovl$queryHits
LAD.borders[["HCT116"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders[["HCT116"]]$max_gene_expression <- 0
LAD.borders[["HCT116"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders[["HCT116"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expression
ovl <- as_tibble(findOverlaps(LAD.borders.pa[["HCT116_pA"]],
genes_human_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_human)[subjectHits]),
expression = fpkm_human$HCT116_expr[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders.pa[["HCT116_pA"]]$ovl_gene <- (1:length(LAD.borders.pa[["HCT116_pA"]])) %in% ovl$queryHits
LAD.borders.pa[["HCT116_pA"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders.pa[["HCT116_pA"]]$max_gene_expression <- 0
LAD.borders.pa[["HCT116_pA"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders.pa[["HCT116_pA"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expression
# Mark borders close to genes - K562 cells
ovl <- as_tibble(findOverlaps(LAD.borders[["K562"]],
genes_human_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_human)[subjectHits]),
expression = fpkm_human$K562_expr[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders[["K562"]]$ovl_gene <- (1:length(LAD.borders[["K562"]])) %in% ovl$queryHits
LAD.borders[["K562"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders[["K562"]]$max_gene_expression <- 0
LAD.borders[["K562"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders[["K562"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expression
ovl <- as_tibble(findOverlaps(LAD.borders.pa[["K562_pA"]],
genes_human_expand,
ignore.strand = T)) %>%
mutate(strand = as.character(strand(genes_human)[subjectHits]),
expression = fpkm_human$K562_expr[subjectHits]) %>%
filter(expression > 1) %>%
group_by(queryHits) %>%
dplyr::summarise(max_expression = max(expression),
min_expression = min(expression),
strand = case_when(all(strand == "+") ~ "+",
all(strand == "-") ~ "-",
T ~ "ambiguous"))
LAD.borders.pa[["K562_pA"]]$ovl_gene <- (1:length(LAD.borders.pa[["K562_pA"]])) %in% ovl$queryHits
LAD.borders.pa[["K562_pA"]]$strand_of_gene <- factor(NA, levels = c("+", "-", "ambiguous"))
LAD.borders.pa[["K562_pA"]]$max_gene_expression <- 0
LAD.borders.pa[["K562_pA"]]$strand_of_gene[ovl$queryHits] <- ovl$strand
LAD.borders.pa[["K562_pA"]]$max_gene_expression[ovl$queryHits] <- ovl$max_expression
# Also get TSS and TES
tss_human <- tes_human <- genes_human
mcols(tss_human) <- mcols(tes_human) <- data.frame(motif_count = 1,
motif_strand = strand(genes_human))
start(tss_human) <- end(tss_human) <- ifelse(strand(genes_human) == "+",
start(genes_human),
end(genes_human))
start(tes_human) <- end(tes_human) <- ifelse(strand(genes_human) == "-",
start(genes_human),
end(genes_human))To compare CTCF density to LAD borders, I first need to compute the distance to LAD borders.
#######################################
## Calculate CTCF positioning near LAD borders
# Also select bins overlapping a CTCF site for a different perspective
bins.10kb.CTCF <- bins.10kb
for (cell in names(bins.10kb.CTCF)) {
cell.bins <- bins.10kb.CTCF[[cell]]
cell.CTCF <- grMid(CTCF.sites[[cell]])
cell.bins <- cell.bins[overlapsAny(cell.bins, cell.CTCF)]
bins.10kb.CTCF[[cell]] <- cell.bins
}
bins.10kb.CTCF.pa <- bins.10kb.pa
for (cell in names(bins.10kb.CTCF.pa)) {
cell.bins <- bins.10kb.CTCF.pa[[cell]]
cell.CTCF <- grMid(CTCF.sites.pa[[cell]])
cell.bins <- cell.bins[overlapsAny(cell.bins, cell.CTCF)]
bins.10kb.CTCF.pa[[cell]] <- cell.bins
}
# Distance to LAD border
CTCF.sites <- CTCFDistance(CTCF.sites, LAD.borders)
bins.10kb <- CTCFDistance(bins.10kb, LAD.borders)
bins.10kb.CTCF <- CTCFDistance(bins.10kb.CTCF, LAD.borders)
CTCF.sites.pa <- CTCFDistance(CTCF.sites.pa, LAD.borders.pa)
bins.10kb.pa <- CTCFDistance(bins.10kb.pa, LAD.borders.pa)
bins.10kb.CTCF.pa <- CTCFDistance(bins.10kb.CTCF.pa, LAD.borders.pa)
#######################################
## Get count per 10kb bins for CTCF and control bins
# Count occurences
CTCF.distances <- CountPerBins(CTCF.sites, sample_name = "CTCF", bin.size = bin.size)
bins.distances <- CountPerBins(bins.10kb, sample_name = "bins", bin.size = bin.size)
bins.CTCF.distances <- CountPerBins(bins.10kb.CTCF, sample_name = "bins.CTCF", bin.size = bin.size)
CTCF.distances.pa <- CountPerBins(CTCF.sites.pa, sample_name = "CTCF", bin.size = bin.size)
bins.distances.pa <- CountPerBins(bins.10kb.pa, sample_name = "bins", bin.size = bin.size)
bins.CTCF.distances.pa <- CountPerBins(bins.10kb.CTCF.pa, sample_name = "bins.CTCF", bin.size = bin.size)
# Calculate ratios
distances <- CTCF.distances %>%
full_join(bins.distances) %>%
full_join(bins.CTCF.distances) %>%
rowwise() %>%
mutate(ratio = CTCF / bins,
ratio_withCTCF = bins.CTCF / bins) %>%
ungroup() %>%
filter(bins > 50)
distances.pa <- CTCF.distances.pa %>%
full_join(bins.distances.pa) %>%
full_join(bins.CTCF.distances.pa) %>%
rowwise() %>%
mutate(ratio = CTCF / bins,
ratio_withCTCF = bins.CTCF / bins) %>%
ungroup() %>%
filter(bins > 50)Having the distances and a summary table, I can do some plotting. I will plot the results in two ways:
#######################################
## Plot the distances
# Plot the CTCF density for 10kb bins (100x 10kb = 1Mb)
distances %>%
filter(cell != "mESC_serum") %>%
ggplot(aes(x = value / scaling, y = ratio)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(col = "red", size = 1) +
facet_grid(cell ~ .) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("CTCF-bound sites per 10kb") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.35)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))# Plot the fraction of bins with CTCF peaks
distances %>%
filter(cell != "mESC_serum") %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(col = "red", size = 1) +
facet_grid(cell ~ .) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.37)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))# Zoom-in
distances %>%
filter(cell != "mESC_serum") %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(col = "red", size = 1) +
facet_grid(cell ~ .) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.1, 0.1), ylim = c(0, 0.37)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))# For pA data sets
distances.pa %>%
ggplot(aes(x = value / scaling, y = ratio)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5, fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(col = "red", size = 1) +
facet_grid(cell ~ .) +
ggtitle("CTCF enrichment at pA-DamID LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("CTCF-bound sites per 10kb") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.43)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))distances.pa %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(col = "red", size = 1) +
facet_grid(cell ~ .) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.35)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))# Zoom-in
distances.pa %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(col = "red", size = 1) +
facet_grid(cell ~ .) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.1, 0.1), ylim = c(0, 0.35)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))It seems that CTCF peaks are indeed enriched surrounding LAD borders for some cells. The enrichment in mESC cells is not very strong, but I would argue still present.
From the previous analysis I know that the CTCF peak occurs at distances 15000-5000 bases before the LAD border. Here, I will classify LAD borders into borders with and without CTCF, using these cutoffs.
# Classify LAD borders
LAD.borders <- ClassifyLADBorders(LAD.borders, CTCF.sites, LADs)
# Plot amount
as_tibble(LAD.borders) %>%
dplyr::select(group_name, CTCF) %>%
group_by(group_name) %>%
summarise(mean(CTCF == "CTCF")) %>%
rename_at(vars(names(.)), ~ c("cell", "fraction")) %>%
mutate(cell = factor(cell, levels = levels(metadata$cell))) %>%
ggplot(aes(x = cell, y = fraction, fill = cell)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Set1", name = "Cell type", guide = F) +
ggtitle("LAD border classification with CTCF status") +
xlab(NULL) +
ylab("Fraction borders defined as CTCF borders") +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))# Classify LAD borders
LAD.borders.pa <- ClassifyLADBorders(LAD.borders.pa, CTCF.sites.pa, LADs.pa,
cells = levels(metadata.pa$cell))
# Plot amount
as_tibble(LAD.borders.pa) %>%
dplyr::select(group_name, CTCF) %>%
group_by(group_name) %>%
summarise(mean(CTCF == "CTCF")) %>%
rename_at(vars(names(.)), ~ c("cell", "fraction")) %>%
mutate(cell = factor(cell, levels = levels(metadata.pa$cell))) %>%
ggplot(aes(x = cell, y = fraction, fill = cell)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Set1", name = "Cell type", guide = F) +
ggtitle("LAD border classification with CTCF status") +
xlab(NULL) +
ylab("Fraction borders defined as CTCF borders") +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))The previous figures show the enrichment of CTCF at LAD borders. However, CTCF binds the genome in an oriented manner, and we should treat this accordingly. Let’s show how the CTCF motif is oriented at LAD borders.
# Classify LAD borders based on CTCF orientation
ClassifyLADBordersOrientation <- function(borders_mESC, ctcf_sites, LADs_mESC, overlapping = F,
ranges = c(0, 20000), col_name = "CTCF") {
# Remove overlapping sites
if (! overlapping) ctcf_sites <- ctcf_sites[! overlapsAny(ctcf_sites, LADs_mESC)]
# Get LAD borders with CTCF peaks within the given range
dis_borders <- as_tibble(distanceToNearest(borders_mESC, ctcf_sites, ignore.strand = T))
dis_sites <- as_tibble(distanceToNearest(ctcf_sites, borders_mESC, ignore.strand = T))
# Determine uniqueness of borders - from ctcf peaks
dis_sites <- dis_sites %>%
filter(distance < ranges[2]) %>%
mutate(border = subjectHits,
border_strand = as.character(strand(borders_mESC)[subjectHits]),
motif_strand = as.character(ctcf_sites$motif_strand[queryHits]),
motif_orientation = as.character(ctcf_sites$motif_orientation[queryHits])) %>%
group_by(border) %>%
summarise(mESC_inwards = all(motif_orientation == "mESC_inwards"),
mESC_outwards = all(motif_orientation == "mESC_outwards")) %>%
ungroup() %>%
mutate(class = case_when(mESC_inwards == T ~ "inwards",
mESC_outwards == T ~ "outwards",
T ~ "ambiguous"))
# Determine uniqueness of borders - from borders
dis_borders <- dis_borders %>%
filter(distance < ranges[2]) %>%
mutate(border = queryHits,
border_strand_border = as.character(strand(borders_mESC)[queryHits]),
motif_strand_border = as.character(ctcf_sites$motif_strand[subjectHits]),
motif_orientation_border = as.character(ctcf_sites$motif_orientation[subjectHits])) %>%
mutate(class_border = case_when(motif_orientation_border == "mESC_both" ~ "ambiguous",
motif_orientation_border == "mESC_inwards" ~ "inwards",
motif_orientation_border == "mESC_outwards" ~ "outwards")) %>%
dplyr::select(-queryHits, -subjectHits)
# Combine
border_class <- full_join(dis_sites, dis_borders) %>%
mutate(class = case_when(is.na(class) ~ "ambiguous",
T ~ class))
# Classify borders
mcols(borders_mESC)[, col_name] <- "nonCTCF"
mcols(borders_mESC)[border_class$border, col_name] <- border_class$class
borders_mESC
}
CTCFOrientationEnrichment <- function(gr_sites, gr_lads, gr_borders, gr_bins, cell) {
original_cell <- cell
# Get ctcf peaks
ctcf <- as_tibble(gr_sites) %>%
rowwise() %>%
mutate(distance = ifelse(overlap.LAD == T,
max(dis.precede, dis.follow, na.rm = T),
- max(dis.precede, dis.follow, na.rm = T)),
at_border = distance <= 0 & distance > -20e3,
border = case_when(overlap.LAD == T & is.na(dis.follow) ~ "right",
overlap.LAD == F & is.na(dis.precede) ~ "right",
T ~ "left")) %>%
ungroup()
# Update CTCF sites
gr_sites$distance <- ctcf$distance
gr_sites$border <- ctcf$border
gr_sites$at_border <- ctcf$at_border
# Which bin size should I use for this?
ctcf_bin_size <- bin.size
scaling = 1e6 / ctcf_bin_size
# Separate CTCF outwards and inwards of the LAD border
ctcf_outwards <- ctcf %>%
filter(motif_count > 0,
! motif_strand == "both",
! (border == "right" & motif_strand == "-"),
! (border == "left" & motif_strand == "+")) %>%
mutate(cell = paste0(cell, "_outwards"))
ctcf_inwards <- ctcf %>%
filter(motif_count > 0,
! motif_strand == "both",
! (border == "left" & motif_strand == "-"),
! (border == "right" & motif_strand == "+")) %>%
mutate(cell = paste0(cell, "_inwards"))
ctcf_both <- ctcf %>%
filter(motif_count > 0,
motif_strand == "both") %>%
mutate(cell = paste0(cell, "_both"))
ctcf <- ctcf %>%
mutate(motif_orientation = case_when((motif_count > 0 &
! motif_strand == "both" &
! (border == "right" & motif_strand == "-") &
! (border == "left" & motif_strand == "+")) ~ "mESC_outwards",
(motif_count > 0 &
! motif_strand == "both" &
! (border == "left" & motif_strand == "-") &
! (border == "right" & motif_strand == "+")) ~ "mESC_inwards",
(motif_count > 0 &
motif_strand == "both") ~ paste0(cell, "_both"),
T ~ paste0(cell, "_nostrand")))
gr_ctcf <- as(ctcf, "GRanges")
# Prepare lists for functions made previously
ctcf_list <- GRangesList(outwards = as(ctcf_outwards, "GRanges"),
inwards = as(ctcf_inwards, "GRanges"),
both = as(ctcf_both, "GRanges"))
border_list <- GRangesList(outwards = gr_borders,
inwards = gr_borders,
both = gr_borders)
bins_10kb <- GRangesList(outwards = gr_bins,
inwards = gr_bins,
both = gr_bins)
LAD_list <- GRangesList(outwards = gr_lads,
inwards = gr_lads,
both = gr_lads)
# Also select bins overlapping a CTCF site for a different perspective
bins_10kb_CTCF <- bins_10kb
for (cell in names(bins_10kb_CTCF)) {
cell_bins <- bins_10kb_CTCF[[cell]]
cell_CTCF <- grMid(ctcf_list[[cell]])
cell_bins <- cell_bins[overlapsAny(cell_bins, cell_CTCF)]
bins_10kb_CTCF[[cell]] <- cell_bins
}
# Distance to LAD border
ctcf_list <- CTCFDistance(ctcf_list, border_list)
bins_10kb <- CTCFDistance(bins_10kb, border_list)
bins_10kb_CTCF <- CTCFDistance(bins_10kb_CTCF, border_list)
# Count occurences
cell_distances <- CountPerBins(ctcf_list, sample_name = "CTCF", bin.size = ctcf_bin_size)
bins_distances <- CountPerBins(bins_10kb, sample_name = "bins", bin.size = ctcf_bin_size)
bins_CTCF_distances <- CountPerBins(bins_10kb_CTCF, sample_name = "bins_CTCF", bin.size = ctcf_bin_size)
# Calculate ratios
distances <- cell_distances %>%
full_join(bins_distances) %>%
full_join(bins_CTCF_distances) %>%
rowwise() %>%
mutate(ratio = CTCF / bins,
ratio_withCTCF = bins_CTCF / bins,
original_cell = original_cell) %>%
ungroup() %>%
filter(bins > 50,
cell != "both")
# Plot the fraction of bins with CTCF peaks
plt <- distances %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF, col = cell)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(size = 1) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.16)) +
scale_color_brewer(palette = "Set2") +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
#plot(plt)
plt <- distances %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(size = 1, col = "red") +
facet_grid(. ~ cell) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.16)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
#plot(plt)
# Classify LAD borders
gr_borders <- ClassifyLADBordersOrientation(gr_borders, gr_ctcf, gr_lads,
col_name = c("CTCF_strand"))
# Plot amount
plt <- as_tibble(gr_borders) %>%
group_by(CTCF_strand) %>%
summarise(n = n()) %>%
mutate(fraction = n / sum(n)) %>%
mutate(CTCF_strand = factor(CTCF_strand,
levels = c("outwards", "inwards", "ambiguous", "nonCTCF"))) %>%
ggplot(aes(x = CTCF_strand, y = fraction, fill = CTCF_strand)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Set1", name = "Cell type", guide = F) +
ggtitle("LAD border classification with CTCF status") +
xlab(NULL) +
ylab("Fraction borders defined as CTCF borders") +
scale_fill_brewer(palette = "Set2", name = "Border class") +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
#plot(plt)
# Also, update CTCF peaks
tib <- as_tibble(gr_sites) %>%
dplyr::select(seqnames, start, end, which.LAD, border, at_border) %>%
mutate(border_idx = 2 * which.LAD - ifelse(border == "left", 1, 0),
border_class = gr_borders$CTCF_strand[border_idx])
gr_sites$border_idx <- tib$border_idx
gr_sites$border_class <- tib$border_class
# Return objects
list(gr_sites, gr_borders, distances)
}
# 1) DamID data
all_distances <- tibble()
for (cell in metadata$cell) {
if (cell %in% c("mESC", "mESC_serum", "mNPC")) {
tmp <- CTCFOrientationEnrichment(gr_sites = CTCF.sites[[cell]],
gr_lads = LADs[[cell]],
gr_borders = LAD.borders[[cell]],
gr_bins = bins.10kb.mouse,
cell = cell)
} else {
tmp <- CTCFOrientationEnrichment(gr_sites = CTCF.sites[[cell]],
gr_lads = LADs[[cell]],
gr_borders = LAD.borders[[cell]],
gr_bins = bins.10kb.human,
cell = cell)
}
CTCF.sites[[cell]] <- tmp[[1]]
LAD.borders[[cell]] <- tmp[[2]]
all_distances <- bind_rows(all_distances, tmp[[3]])
}
plt <- all_distances %>%
filter(original_cell != "mESC_serum") %>%
mutate(original_cell = factor(original_cell, levels(metadata$cell))) %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(size = 1, col = "red") +
facet_grid(original_cell ~ cell) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.13)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)plt <- all_distances %>%
filter(original_cell != "mESC_serum") %>%
mutate(original_cell = factor(original_cell, levels(metadata$cell))) %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(size = 1, col = "red") +
facet_grid(original_cell ~ cell) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.15, 0.15), ylim = c(0, 0.13)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)# 2) pA_DamID data
all_distances <- tibble()
for (cell in metadata.pa$cell) {
if (cell %in% c("mESC_pA_pT")) {
tmp <- CTCFOrientationEnrichment(gr_sites = CTCF.sites.pa[[cell]],
gr_lads = LADs.pa[[cell]],
gr_borders = LAD.borders.pa[[cell]],
gr_bins = bins.10kb.mouse,
cell = cell)
} else {
tmp <- CTCFOrientationEnrichment(gr_sites = CTCF.sites.pa[[cell]],
gr_lads = LADs.pa[[cell]],
gr_borders = LAD.borders.pa[[cell]],
gr_bins = bins.10kb.human,
cell = cell)
}
CTCF.sites.pa[[cell]] <- tmp[[1]]
LAD.borders.pa[[cell]] <- tmp[[2]]
all_distances <- bind_rows(all_distances, tmp[[3]])
}
plt <- all_distances %>%
mutate(original_cell = factor(original_cell, levels(metadata.pa$cell))) %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(size = 1, col = "red") +
facet_grid(original_cell ~ cell) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.13)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)plt <- all_distances %>%
mutate(original_cell = factor(original_cell, levels(metadata.pa$cell))) %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(size = 1, col = "red") +
facet_grid(original_cell ~ cell) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.15, 0.15), ylim = c(0, 0.13)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)Finally, I want to address whether the LAD borders that do not change over time are more or less enriched for CTCF sites. This is basically the same thing as cLADs and fLADs.
I will perform this analysis in two parts: for mouse and human.
However, I first need to define what I mean with “dynamic LAD borders”. These are LAD border that are different between the cell types. These can be LAD borders within and outside LADs (in the other cell type). To prevent small changes in the HMM calling from having a big effect, I will filter out borders that do not overlap but are close.
# Remove serum data
metadata.serum <- metadata[2, ]
LADs.serum <- LADs[2]
LAD.borders.serum <- LAD.borders[2]
CTCF.sites.serum <- CTCF.sites[2]
bins.10kb.serum <- bins.10kb[2]
bins.10kb.CTCF.serum <- bins.10kb.CTCF[2]
metadata <- metadata[-2, ]
LADs <- LADs[-2]
LAD.borders <- LAD.borders[-2]
CTCF.sites <- CTCF.sites[-2]
bins.10kb <- bins.10kb[-2]
bins.10kb.CTCF <- bins.10kb.CTCF[-2]
# Get unique borders
LAD.borders.mouse <- GetBorderDifferences(LAD.borders[c("mESC", "mNPC")])
LAD.borders.human <- GetBorderDifferences(LAD.borders[c("H1", "Hap1", "K562", "HCT116")])
LAD.borders <- c(LAD.borders.mouse, LAD.borders.human)
# Plot numbers
tib <- full_join(as_tibble(LAD.borders.mouse),
as_tibble(LAD.borders.human)) %>%
dplyr::select(group_name, CTCF, class) %>%
rename(group_name = "cell") %>%
mutate(cell = factor(cell, levels = levels(metadata$cell)))
tib %>%
ggplot(aes(x = cell, fill = class)) +
geom_bar(position = "dodge", col = "black") +
ggtitle("Shared and unique LAD borders") +
xlab(NULL) +
ylab("Count") +
scale_fill_grey() +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))Having classified borders as shared or unique between cell types, I can address the question whether there is an enrichment in CTCF binding at stable or dynamic borders.
# Plot with fraction CTCF
tib.summary <- tib %>%
group_by(cell, class) %>%
summarise(count = n(),
mean = mean(CTCF == "CTCF"))
tib.summary %>%
ggplot(aes(x = cell, y = mean, fill = class)) +
geom_bar(stat = "identity", position = "dodge", col = "black") +
ggtitle("Shared and unique LAD borders") +
xlab(NULL) +
ylab("Fraction CTCF borders") +
scale_fill_grey() +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))For all cell types tested, shared borders have a stronger enrichment of CTCF peaks than unique borders. The difference is small, but the overall enrichment of CTCF at borders was already small to begin with. Let’s repeat the enrichment plot, separating shared and unique borders.
# Update CTCF distances
CTCF.sites <- CTCFDistance(CTCF.sites, LAD.borders, border.class = T)
bins.10kb <- CTCFDistance(bins.10kb, LAD.borders, border.class = T)
bins.10kb.CTCF <- CTCFDistance(bins.10kb.CTCF, LAD.borders, border.class = T)
# Update CTCF counts
CTCF.distances <- CountPerBins(CTCF.sites, sample_name = "CTCF", border.class = T,
bin.size = bin.size)
bins.distances <- CountPerBins(bins.10kb, sample_name = "bins", border.class = T,
bin.size = bin.size)
bins.CTCF.distances <- CountPerBins(bins.10kb.CTCF, sample_name = "bins.CTCF",
border.class = T, bin.size = bin.size)
# Calculate ratios
distances <- CTCF.distances %>%
full_join(bins.distances) %>%
full_join(bins.CTCF.distances) %>%
rowwise() %>%
mutate(ratio = CTCF / bins,
ratio_withCTCF = bins.CTCF / bins) %>%
ungroup() %>%
filter(bins > 50)
# Plot the CTCF density for 10kb bins (100x 10kb = 1Mb)
distances %>%
ggplot(aes(x = value / scaling, y = ratio)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5, fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(col = "red", size = 1) +
facet_grid(cell ~ border.class) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("CTCF-bound sites per 10kb") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.58)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))# Plot the fraction of bins with CTCF peaks
distances %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(col = "red", size = 1) +
facet_grid(cell ~ border.class) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.38)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))Plot this in a single plot.
distances %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF, col = border.class)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(size = 1) +
facet_grid(cell ~ .) +
ggtitle("CTCF enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a CTCF peak") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.37)) +
scale_color_brewer(palette = "Set2", name = "Border class") +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))Also in this plot, you can consistently observe the (small) increase in CTCF sites at shared borders. Pretty intriguing.
Based on Max’s suggestion, let’s look at LAD borders +/- CTCF and active genes.
This is based on reprocessed RNAseq data for the 4DN project (see that project), RNAseq data generated for this project in mESC cells (PT clone) and RNAseq data from mNPC cells (Bonev, 2017). Alternatively, I can also use the Bonev data in mESC cells.
# Transcription data - list
tss_list <- list(mESC = tss_mouse[which(fpkm_mesc$WT_0h > 1)],
mNPC = tss_mouse[which(fpkm_mNPC$mNPC > 1)],
H1 = tss_human[which(fpkm_human$H1_expr > 1)],
Hap1 = tss_human[which(fpkm_human$Hap1_expr > 1)],
K562 = tss_human[which(fpkm_human$K562_expr > 1)],
HCT116 = tss_human[which(fpkm_human$HCT116_expr > 1)])
tes_list <- list(mESC = tes_mouse[which(fpkm_mesc$WT_0h > 1)],
mNPC = tes_mouse[which(fpkm_mNPC$mNPC > 1)],
H1 = tes_human[which(fpkm_human$H1_expr > 1)],
Hap1 = tes_human[which(fpkm_human$Hap1_expr > 1)],
K562 = tes_human[which(fpkm_human$K562_expr > 1)],
HCT116 = tes_human[which(fpkm_human$HCT116_expr > 1)])
# 1) Overlap with LAD border
for (cell in names(tss_list)) {
# Get the cell-specific data
cell.tss <- tss_list[[cell]]
cell.tes <- tes_list[[cell]]
cell.LADs <- LADs[[cell]]
# Add LAD overlap
cell.tss$which.LAD <- cell.tss$overlap.LAD <- NA
cell.tes$which.LAD <- cell.tes$overlap.LAD <- NA
dis <- as_tibble(distanceToNearest(cell.tss, cell.LADs))
cell.tss$which.LAD[dis$queryHits] <- dis$subjectHits
cell.tss$overlap.LAD[dis$queryHits] <- dis$distance == 0
dis <- as_tibble(distanceToNearest(cell.tes, cell.LADs))
cell.tes$which.LAD[dis$queryHits] <- dis$subjectHits
cell.tes$overlap.LAD[dis$queryHits] <- dis$distance == 0
# Save data
tss_list[[cell]] <- cell.tss
tes_list[[cell]] <- cell.tes
}
# 2) Distance to LAD borders
tss_list <- CTCFDistance(tss_list, LAD.borders)
tes_list <- CTCFDistance(tes_list, LAD.borders)
TranscriptionOrientationEnrichment <- function(gr_sites, gr_lads, gr_borders, gr_bins, cell) {
# Modified function of the CTCF motif strand enrichment:
# - removed "both" group
original_cell <- cell
# Get ctcf peaks
ctcf <- as_tibble(gr_sites) %>%
rowwise() %>%
mutate(distance = ifelse(overlap.LAD == T,
max(dis.precede, dis.follow, na.rm = T),
- max(dis.precede, dis.follow, na.rm = T)),
at_border = distance <= 0 & distance > -20e3,
border = case_when(overlap.LAD == T & is.na(dis.follow) ~ "right",
overlap.LAD == F & is.na(dis.precede) ~ "right",
T ~ "left")) %>%
ungroup()
# Update CTCF sites
gr_sites$distance <- ctcf$distance
gr_sites$border <- ctcf$border
gr_sites$at_border <- ctcf$at_border
# Which bin size should I use for this?
ctcf_bin_size <- bin.size
scaling = 1e6 / ctcf_bin_size
# Separate CTCF outwards and inwards of the LAD border
ctcf_outwards <- ctcf %>%
filter(motif_count > 0,
! motif_strand == "both",
! (border == "right" & motif_strand == "-"),
! (border == "left" & motif_strand == "+")) %>%
mutate(cell = paste0(cell, "_outwards"))
ctcf_inwards <- ctcf %>%
filter(motif_count > 0,
! motif_strand == "both",
! (border == "left" & motif_strand == "-"),
! (border == "right" & motif_strand == "+")) %>%
mutate(cell = paste0(cell, "_inwards"))
ctcf <- ctcf %>%
mutate(motif_orientation = case_when((motif_count > 0 &
! (border == "right" & motif_strand == "-") &
! (border == "left" & motif_strand == "+")) ~ "mESC_outwards",
(motif_count > 0 &
! (border == "left" & motif_strand == "-") &
! (border == "right" & motif_strand == "+")) ~ "mESC_inwards",
T ~ paste0(cell, "_nostrand")))
gr_ctcf <- as(ctcf, "GRanges")
# Prepare lists for functions made previously
ctcf_list <- GRangesList(outwards = as(ctcf_outwards, "GRanges"),
inwards = as(ctcf_inwards, "GRanges"))
border_list <- GRangesList(outwards = gr_borders,
inwards = gr_borders)
bins_10kb <- GRangesList(outwards = gr_bins,
inwards = gr_bins)
LAD_list <- GRangesList(outwards = gr_lads,
inwards = gr_lads)
# Also select bins overlapping a CTCF site for a different perspective
bins_10kb_CTCF <- bins_10kb
for (cell in names(bins_10kb_CTCF)) {
cell_bins <- bins_10kb_CTCF[[cell]]
cell_CTCF <- grMid(ctcf_list[[cell]])
cell_bins <- cell_bins[overlapsAny(cell_bins, cell_CTCF)]
bins_10kb_CTCF[[cell]] <- cell_bins
}
# Distance to LAD border
ctcf_list <- CTCFDistance(ctcf_list, border_list)
bins_10kb <- CTCFDistance(bins_10kb, border_list)
bins_10kb_CTCF <- CTCFDistance(bins_10kb_CTCF, border_list)
# Count occurences
cell_distances <- CountPerBins(ctcf_list, sample_name = "CTCF", bin.size = ctcf_bin_size)
bins_distances <- CountPerBins(bins_10kb, sample_name = "bins", bin.size = ctcf_bin_size)
bins_CTCF_distances <- CountPerBins(bins_10kb_CTCF, sample_name = "bins_CTCF", bin.size = ctcf_bin_size)
# Calculate ratios
distances <- cell_distances %>%
full_join(bins_distances) %>%
full_join(bins_CTCF_distances) %>%
rowwise() %>%
mutate(ratio = CTCF / bins,
ratio_withCTCF = bins_CTCF / bins,
original_cell = original_cell) %>%
ungroup() %>%
filter(bins > 50,
cell != "both")
# Return objects
distances
}
# 1) DamID data
all_distances <- tibble()
for (cell in metadata$cell) {
if (cell %in% c("mESC", "mESC_serum", "mNPC")) {
tmp_tss <- TranscriptionOrientationEnrichment(gr_sites = tss_list[[cell]],
gr_lads = LADs[[cell]],
gr_borders = LAD.borders[[cell]],
gr_bins = bins.10kb.mouse,
cell = cell) %>%
mutate(class = "tss")
tmp_tes <- TranscriptionOrientationEnrichment(gr_sites = tes_list[[cell]],
gr_lads = LADs[[cell]],
gr_borders = LAD.borders[[cell]],
gr_bins = bins.10kb.mouse,
cell = cell) %>%
mutate(class = "tes")
} else {
tmp_tss <- TranscriptionOrientationEnrichment(gr_sites = tss_list[[cell]],
gr_lads = LADs[[cell]],
gr_borders = LAD.borders[[cell]],
gr_bins = bins.10kb.human,
cell = cell) %>%
mutate(class = "tss")
tmp_tes <- TranscriptionOrientationEnrichment(gr_sites = tes_list[[cell]],
gr_lads = LADs[[cell]],
gr_borders = LAD.borders[[cell]],
gr_bins = bins.10kb.human,
cell = cell) %>%
mutate(class = "tes")
}
all_distances <- bind_rows(all_distances, tmp_tss, tmp_tes)
}
plt <- all_distances %>%
mutate(original_cell = factor(original_cell, levels(metadata$cell)),
class = factor(class, c("tss", "tes"))) %>%
ggplot(aes(x = value / scaling, y = ratio_withCTCF, col = cell)) +
annotate("rect", xmin = 0, xmax = 1, ymin = -1, ymax = 5,
fill = "grey", alpha = 0.3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
geom_line(size = 1) +
facet_grid(original_cell ~ class) +
scale_color_manual(values = c("green3", "brown")) +
#palette = "Dark2") +
ggtitle("Active gene enrichment at LAD borders") +
xlab("Distance from LAD border (Mb)") +
ylab("Fraction 10kb bins overlapping a TSS/TES") +
coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 0.09)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)Next, I will determine the numbers of the segmented LAD borders based on CTCF binding, CTCF strand and positioning of active genes.
# Convert LAD borders to tibble
tib <- as_tibble(unlist(LAD.borders)) %>%
mutate(strand = as.character(strand),
strand_of_gene = as.character(strand_of_gene),
gene_orientation = case_when(strand_of_gene == "ambiguous" ~ "ambiguous",
strand_of_gene == strand ~ "inwards",
strand_of_gene != strand ~ "outwards",
T ~ "-"))
# Plot fraction of borders overlapping with active genes
tib %>%
group_by(cell, CTCF) %>%
dplyr::summarise(n = n(),
all = mean(ovl_gene),
inwards = mean(gene_orientation == "inwards"),
outwards = mean(gene_orientation == "outwards")) %>%
gather(key, value, all, inwards, outwards) %>%
mutate(cell = factor(cell, levels(metadata$cell)),
key = factor(key, rev(c("all", "inwards", "outwards")))) %>%
ggplot(aes(x = key, y = value, fill = CTCF)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_grey() +
coord_flip() +
ylab("Fraction borders overlapping gene") +
xlab("Gene orientation") +
facet_wrap(. ~ cell, ncol = 1) +
theme_classic() +
theme(aspect.ratio = 1)# Conclusion: there is no enrichment of (active) gene orientation at LAD borders
# Plot expression of (active) genes overlapping borders
tib %>%
filter(gene_orientation %in% c("inwards", "outwards")) %>%
mutate(cell = factor(cell, levels(metadata$cell))) %>%
ggplot(aes(x = interaction(gene_orientation, CTCF), y = log2(max_gene_expression+1))) +
geom_quasirandom() +
geom_boxplot(outlier.shape = NA, col = "red", fill = NA) +
coord_flip() +
ylab("Expression (log2)") +
xlab("Border class") +
facet_wrap(. ~ cell, ncol = 1, scales = "free") +
theme_classic() +
theme(aspect.ratio = 1)# Conclusion: there is no difference in gene expression based on gene strand
# Plot border classification - with filtered borders with overlapping genes
tib %>%
group_by(cell, CTCF, ovl_gene) %>%
dplyr::summarise(n = n()) %>%
mutate(cell = factor(cell, levels(metadata$cell))) %>%
ggplot(aes(x = CTCF, fill = ovl_gene, y = n)) +
geom_bar(stat = "identity") +
xlab("Border class") +
ylab("Count") +
scale_fill_grey() +
coord_flip() +
facet_wrap(cell ~ ., ncol = 1) +
theme_classic() +
theme(aspect.ratio = 1/2)# I want the numbers for the manuscript text
tib %>%
group_by(cell, CTCF, ovl_gene) %>%
dplyr::summarise(n = n()) %>%
spread(ovl_gene, n) %>%
mutate(total = `TRUE` + `FALSE`,
fraction = `FALSE` / (`TRUE` + `FALSE`)) %>%
print(n = 40)## # A tibble: 12 × 6
## # Groups: cell, CTCF [12]
## cell CTCF `FALSE` `TRUE` total fraction
## <chr> <chr> <int> <int> <int> <dbl>
## 1 H1 CTCF 852 201 1053 0.809
## 2 H1 nonCTCF 734 219 953 0.770
## 3 Hap1 CTCF 1271 206 1477 0.861
## 4 Hap1 nonCTCF 1489 297 1786 0.834
## 5 HCT116 CTCF 1295 295 1590 0.814
## 6 HCT116 nonCTCF 752 229 981 0.767
## 7 K562 CTCF 812 138 950 0.855
## 8 K562 nonCTCF 1056 250 1306 0.809
## 9 mESC CTCF 751 290 1041 0.721
## 10 mESC nonCTCF 786 415 1201 0.654
## 11 mNPC CTCF 941 518 1459 0.645
## 12 mNPC nonCTCF 972 588 1560 0.623
# Repeat, by strand
tib %>%
group_by(cell, CTCF_strand, ovl_gene) %>%
dplyr::summarise(n = n()) %>%
mutate(cell = factor(cell, levels(metadata$cell)),
CTCF_strand = factor(CTCF_strand, c("outwards", "inwards", "ambiguous", "nonCTCF"))) %>%
ggplot(aes(x = CTCF_strand, fill = ovl_gene, y = n)) +
geom_bar(stat = "identity") +
xlab("Border class") +
ylab("Count") +
scale_fill_grey() +
coord_flip() +
facet_wrap(cell ~ ., ncol = 1) +
theme_classic() +
theme(aspect.ratio = 3/4)# I want the numbers for the manuscript text
tib %>%
group_by(cell, CTCF_strand, ovl_gene) %>%
dplyr::summarise(n = n()) %>%
spread(ovl_gene, n) %>%
mutate(total = `TRUE` + `FALSE`,
fraction = `FALSE` / (`TRUE` + `FALSE`)) %>%
print(n = 40)## # A tibble: 24 × 6
## # Groups: cell, CTCF_strand [24]
## cell CTCF_strand `FALSE` `TRUE` total fraction
## <chr> <chr> <int> <int> <int> <dbl>
## 1 H1 ambiguous 540 128 668 0.808
## 2 H1 inwards 169 45 214 0.790
## 3 H1 nonCTCF 734 219 953 0.770
## 4 H1 outwards 143 28 171 0.836
## 5 Hap1 ambiguous 636 116 752 0.846
## 6 Hap1 inwards 344 64 408 0.843
## 7 Hap1 nonCTCF 1489 297 1786 0.834
## 8 Hap1 outwards 291 26 317 0.918
## 9 HCT116 ambiguous 778 181 959 0.811
## 10 HCT116 inwards 315 74 389 0.810
## 11 HCT116 nonCTCF 752 229 981 0.767
## 12 HCT116 outwards 202 40 242 0.835
## 13 K562 ambiguous 431 72 503 0.857
## 14 K562 inwards 223 39 262 0.851
## 15 K562 nonCTCF 1056 250 1306 0.809
## 16 K562 outwards 158 27 185 0.854
## 17 mESC ambiguous 319 132 451 0.707
## 18 mESC inwards 255 89 344 0.741
## 19 mESC nonCTCF 786 415 1201 0.654
## 20 mESC outwards 177 69 246 0.720
## 21 mNPC ambiguous 402 243 645 0.623
## 22 mNPC inwards 285 152 437 0.652
## 23 mNPC nonCTCF 972 588 1560 0.623
## 24 mNPC outwards 254 123 377 0.674
Repeat this for the pA-DamID data.
# Convert LAD borders to tibble
tib <- as_tibble(unlist(LAD.borders.pa)) %>%
mutate(strand = as.character(strand),
strand_of_gene = as.character(strand_of_gene),
gene_orientation = case_when(strand_of_gene == "ambiguous" ~ "ambiguous",
strand_of_gene == strand ~ "inwards",
strand_of_gene != strand ~ "outwards",
T ~ "-"))
# Plot fraction of borders overlapping with active genes
tib %>%
group_by(cell, CTCF) %>%
dplyr::summarise(n = n(),
all = mean(ovl_gene),
inwards = mean(gene_orientation == "inwards"),
outwards = mean(gene_orientation == "outwards")) %>%
gather(key, value, all, inwards, outwards) %>%
mutate(cell = factor(cell, levels(metadata$cell)),
key = factor(key, rev(c("all", "inwards", "outwards")))) %>%
ggplot(aes(x = key, y = value, fill = CTCF)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_grey() +
coord_flip() +
ylab("Fraction borders overlapping gene") +
xlab("Gene orientation") +
facet_wrap(. ~ cell, ncol = 1) +
theme_classic() +
theme(aspect.ratio = 1)# Plot expression of (active) genes overlapping borders
tib %>%
filter(gene_orientation %in% c("inwards", "outwards")) %>%
mutate(cell = factor(cell, levels(metadata.pa$cell))) %>%
ggplot(aes(x = interaction(gene_orientation, CTCF), y = log2(max_gene_expression+1))) +
geom_quasirandom() +
geom_boxplot(outlier.shape = NA, col = "red", fill = NA) +
coord_flip() +
ylab("Expression (log2)") +
xlab("Border class") +
facet_wrap(. ~ cell, ncol = 1, scales = "free") +
theme_classic() +
theme(aspect.ratio = 1)# Plot border classification - with filtered borders with overlapping genes
tib %>%
group_by(cell, CTCF, ovl_gene) %>%
dplyr::summarise(n = n()) %>%
mutate(cell = factor(cell, levels(metadata.pa$cell))) %>%
ggplot(aes(x = CTCF, fill = ovl_gene, y = n)) +
geom_bar(stat = "identity") +
xlab("Border class") +
ylab("Count") +
scale_fill_grey() +
coord_flip() +
facet_wrap(cell ~ ., ncol = 1) +
theme_classic() +
theme(aspect.ratio = 1/2)# I want the numbers for the manuscript text
tib %>%
group_by(cell, CTCF, ovl_gene) %>%
dplyr::summarise(n = n()) %>%
spread(ovl_gene, n) %>%
mutate(total = `TRUE` + `FALSE`,
fraction = `FALSE` / (`TRUE` + `FALSE`)) %>%
print(n = 40)## # A tibble: 8 × 6
## # Groups: cell, CTCF [8]
## cell CTCF `FALSE` `TRUE` total fraction
## <chr> <chr> <int> <int> <int> <dbl>
## 1 Hap1_pA CTCF 738 202 940 0.785
## 2 Hap1_pA nonCTCF 839 219 1058 0.793
## 3 HCT116_pA CTCF 948 398 1346 0.704
## 4 HCT116_pA nonCTCF 882 369 1251 0.705
## 5 K562_pA CTCF 682 183 865 0.788
## 6 K562_pA nonCTCF 680 252 932 0.730
## 7 mESC_pA_PT CTCF 429 160 589 0.728
## 8 mESC_pA_PT nonCTCF 579 210 789 0.734
tib %>%
group_by(cell, CTCF_strand, ovl_gene) %>%
dplyr::summarise(n = n()) %>%
mutate(cell = factor(cell, levels(metadata.pa$cell)),
CTCF_strand = factor(CTCF_strand, c("outwards", "inwards", "ambiguous", "nonCTCF"))) %>%
ggplot(aes(x = CTCF_strand, fill = ovl_gene, y = n)) +
geom_bar(stat = "identity") +
xlab("Border class") +
ylab("Count") +
scale_fill_grey() +
coord_flip() +
facet_wrap(cell ~ ., ncol = 1) +
theme_classic() +
theme(aspect.ratio = 3/4)# I want the numbers for the manuscript text
tib %>%
group_by(cell, CTCF_strand, ovl_gene) %>%
dplyr::summarise(n = n()) %>%
spread(ovl_gene, n) %>%
mutate(total = `TRUE` + `FALSE`,
fraction = `FALSE` / (`TRUE` + `FALSE`)) %>%
print(n = 40)## # A tibble: 16 × 6
## # Groups: cell, CTCF_strand [16]
## cell CTCF_strand `FALSE` `TRUE` total fraction
## <chr> <chr> <int> <int> <int> <dbl>
## 1 Hap1_pA ambiguous 375 108 483 0.776
## 2 Hap1_pA inwards 215 60 275 0.782
## 3 Hap1_pA nonCTCF 839 219 1058 0.793
## 4 Hap1_pA outwards 148 34 182 0.813
## 5 HCT116_pA ambiguous 609 247 856 0.711
## 6 HCT116_pA inwards 199 93 292 0.682
## 7 HCT116_pA nonCTCF 882 369 1251 0.705
## 8 HCT116_pA outwards 140 58 198 0.707
## 9 K562_pA ambiguous 363 106 469 0.774
## 10 K562_pA inwards 194 38 232 0.836
## 11 K562_pA nonCTCF 680 252 932 0.730
## 12 K562_pA outwards 125 39 164 0.762
## 13 mESC_pA_PT ambiguous 157 74 231 0.680
## 14 mESC_pA_PT inwards 178 52 230 0.774
## 15 mESC_pA_PT nonCTCF 579 210 789 0.734
## 16 mESC_pA_PT outwards 94 34 128 0.734
Conclusions:
How does this compare with Guelen, 2008?
2688 LAD borders, of which 363 with oriented promoters and 365 with CTCF. Expected by change = 49 borders with both. Result = 65 borders with both. In other words, there is no depletion of these two marks combined.
Finally, save the relevant R objects for later use.
# Add serum again
metadata <- bind_rows(metadata, metadata.serum)
LADs <- c(LADs, LADs.serum)
LAD.borders <- c(LAD.borders, LAD.borders.serum)
CTCF.sites <- c(CTCF.sites, CTCF.sites.serum)
bins.10kb <- c(bins.10kb, bins.10kb.serum)
bins.10kb.CTCF <- c(bins.10kb.CTCF, bins.10kb.CTCF.serum)
# Save objects
saveRDS(metadata,
file.path(output_dir, "metadata.rds"))
saveRDS(LADs,
file.path(output_dir, "LADs.rds"))
saveRDS(LAD.borders,
file.path(output_dir, "LAD_borders.rds"))
saveRDS(CTCF.sites,
file.path(output_dir, "CTCF_sites.rds"))
saveRDS(LADs.pa,
file.path(output_dir, "LADs_pA.rds"))
saveRDS(LAD.borders.pa,
file.path(output_dir, "LAD_borders_pA.rds"))
saveRDS(CTCF.sites.pa,
file.path(output_dir, "CTCF_sites_pA.rds"))
# Save LADs and LAD borders for NQ
write_tsv(as_tibble(LADs[["mESC"]]),
file.path(output_dir, "LADs_mESC_2i.txt"))
write_tsv(as_tibble(LAD.borders[["mESC"]]),
file.path(output_dir, "LAD_borders_mESC_2i.txt"))
write_tsv(as_tibble(LADs[["mESC_serum"]]),
file.path(output_dir, "LADs_mESC_serum.txt"))
write_tsv(as_tibble(LAD.borders[["mESC_serum"]]),
file.path(output_dir, "LAD_borders_mESC_serum.txt"))
export.bed(as_tibble(LADs[["mESC"]]),
file.path(output_dir, "LADs_mESC_2i.bed"))
export.bed(as_tibble(LAD.borders[["mESC"]][LAD.borders[["mESC"]]$CTCF_strand == "nonCTCF"]),
file.path(output_dir, "LAD_borders_mESC_2i_nonCTCF.bed"))
export.bed(as_tibble(LAD.borders[["mESC"]][LAD.borders[["mESC"]]$CTCF_strand == "inwards"]),
file.path(output_dir, "LAD_borders_mESC_2i_inwards.bed"))
export.bed(as_tibble(LAD.borders[["mESC"]][LAD.borders[["mESC"]]$CTCF_strand == "outwards"]),
file.path(output_dir, "LAD_borders_mESC_2i_outwards.bed"))
export.bed(as_tibble(LAD.borders[["mESC"]][LAD.borders[["mESC"]]$CTCF_strand == "ambiguous"]),
file.path(output_dir, "LAD_borders_mESC_2i_ambiguous.bed"))
export.bed(as_tibble(LADs.pa[["mESC_pA_PT"]]),
file.path(output_dir, "LADs_mESC_2i_pA.bed"))
export.bed(as_tibble(LAD.borders.pa[["mESC_pA_PT"]][LAD.borders.pa[["mESC_pA_PT"]]$CTCF_strand == "nonCTCF"]),
file.path(output_dir, "LAD_borders_mESC_2i_pA_nonCTCF.bed"))
export.bed(as_tibble(LAD.borders.pa[["mESC_pA_PT"]][LAD.borders.pa[["mESC_pA_PT"]]$CTCF_strand == "inwards"]),
file.path(output_dir, "LAD_borders_mESC_2i_pA_inwards.bed"))
export.bed(as_tibble(LAD.borders.pa[["mESC_pA_PT"]][LAD.borders.pa[["mESC_pA_PT"]]$CTCF_strand == "outwards"]),
file.path(output_dir, "LAD_borders_mESC_2i_pA_outwards.bed"))
export.bed(as_tibble(LAD.borders.pa[["mESC_pA_PT"]][LAD.borders.pa[["mESC_pA_PT"]]$CTCF_strand == "ambiguous"]),
file.path(output_dir, "LAD_borders_mESC_2i_pA_ambiguous.bed"))CTCF sites are enriched at LAD borders. In some cell lines, this enrichment is more obvious than in others. mESC is one of the cell lines were the effect seems less obvious.
I hope that this enrichment is still strong enough to design any follow up experiment on.
sessionInfo()## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.33 ggbeeswarm_0.6.0 rtracklayer_1.50.0
## [4] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7 IRanges_2.24.1
## [7] S4Vectors_0.28.1 BiocGenerics_0.36.1 forcats_0.5.1
## [10] stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4
## [13] readr_2.0.0 tidyr_1.1.3 tibble_3.1.6
## [16] ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 matrixStats_0.60.0
## [3] fs_1.5.0 bit64_4.0.5
## [5] lubridate_1.7.10 httr_1.4.2
## [7] tools_4.0.5 backports_1.2.1
## [9] bslib_0.2.5.1 utf8_1.2.2
## [11] R6_2.5.1 vipor_0.4.5
## [13] DBI_1.1.1 colorspace_2.0-2
## [15] withr_2.4.2 tidyselect_1.1.1
## [17] bit_4.0.4 compiler_4.0.5
## [19] cli_3.1.0 rvest_1.0.1
## [21] Biobase_2.50.0 xml2_1.3.2
## [23] DelayedArray_0.16.3 labeling_0.4.2
## [25] sass_0.4.0 scales_1.1.1
## [27] digest_0.6.28 Rsamtools_2.6.0
## [29] rmarkdown_2.11 XVector_0.30.0
## [31] pkgconfig_2.0.3 htmltools_0.5.1.1
## [33] MatrixGenerics_1.2.1 highr_0.9
## [35] dbplyr_2.1.1 rlang_0.4.12
## [37] readxl_1.3.1 rstudioapi_0.13
## [39] farver_2.1.0 jquerylib_0.1.4
## [41] generics_0.1.0 jsonlite_1.7.2
## [43] vroom_1.5.3 BiocParallel_1.24.1
## [45] RCurl_1.98-1.3 magrittr_2.0.1
## [47] GenomeInfoDbData_1.2.4 Matrix_1.3-4
## [49] Rcpp_1.0.7 munsell_0.5.0
## [51] fansi_0.5.0 lifecycle_1.0.1
## [53] stringi_1.7.3 yaml_2.2.1
## [55] SummarizedExperiment_1.20.0 zlibbioc_1.36.0
## [57] grid_4.0.5 crayon_1.4.2
## [59] lattice_0.20-44 Biostrings_2.58.0
## [61] haven_2.4.1 hms_1.1.0
## [63] pillar_1.6.4 codetools_0.2-18
## [65] reprex_2.0.0 XML_3.99-0.6
## [67] glue_1.5.0 evaluate_0.14
## [69] modelr_0.1.8 vctrs_0.3.8
## [71] tzdb_0.1.2 cellranger_1.1.0
## [73] gtable_0.3.0 assertthat_0.2.1
## [75] xfun_0.24 broom_0.7.9
## [77] GenomicAlignments_1.26.0 beeswarm_0.4.0
## [79] ellipsis_0.3.2